{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3361\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/malaya/tokenizer.py:202: FutureWarning: Possible nested set at position 3879\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ab473aea2dc64686bdcb25b62d6f9bbf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading:   0%|          | 0.00/397M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-11-20 11:50:56.251428: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2022-11-20 11:50:56.263811: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
      "2022-11-20 11:50:56.263837: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop\n",
      "2022-11-20 11:50:56.263840: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop\n",
      "2022-11-20 11:50:56.263883: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.141.3\n",
      "2022-11-20 11:50:56.263896: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.141.3\n",
      "2022-11-20 11:50:56.263899: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.141.3\n"
     ]
    }
   ],
   "source": [
    "model = malaya.tatabahasa.transformer_tagging(model = 'small')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "data = []\n",
    "with open('test.jsonl') as fopen:\n",
    "    for l in fopen:\n",
    "        data.append(json.loads(l))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "\n",
    "def compute_exact(a_gold, a_pred):\n",
    "    return int(a_gold == a_pred)\n",
    "\n",
    "\n",
    "def compute_f1(a_gold, a_pred):\n",
    "    gold_toks = a_gold\n",
    "    pred_toks = a_pred\n",
    "    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)\n",
    "    num_same = sum(common.values())\n",
    "    if len(gold_toks) == 0 or len(pred_toks) == 0:\n",
    "        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise\n",
    "        return int(gold_toks == pred_toks)\n",
    "    if num_same == 0:\n",
    "        return 0\n",
    "    precision = 1.0 * num_same / len(pred_toks)\n",
    "    recall = 1.0 * num_same / len(gold_toks)\n",
    "    f1 = (2 * precision * recall) / (precision + recall)\n",
    "    return f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Dia', 11), ('tak', 2), ('suka', 2)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "o = model.greedy_decoder(['saya tak suka'])[0]\n",
    "o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Dia', 'tak', 'suka']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p_y = [o_[0] for o_ in o]\n",
    "p_t = [o_[1] for o_ in o]\n",
    "p_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 70%|████████████████████████████████████████████████████████████████▋                           | 3514/4994 [10:44<04:55,  5.00it/s]"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "exact_match, f1, exact_match_tag, f1_tag = [], [], [], []\n",
    "for i in tqdm(range(len(data))):\n",
    "    x = ' '.join([d[0] for d in data[i][1]])\n",
    "    y = [d[0] for d in data[i][0]]\n",
    "    tag = [d[1] for d in data[i][1]]\n",
    "    \n",
    "    o = model.greedy_decoder([x])[0]\n",
    "    \n",
    "    p_y = [o_[0] for o_ in o]\n",
    "    p_t = [o_[1] for o_ in o]\n",
    "    \n",
    "    exact_match.append(compute_exact(y, p_y))\n",
    "    exact_match_tag.append(compute_exact(tag, p_t))\n",
    "    f1.append(compute_f1(y, p_y))\n",
    "    f1_tag.append(compute_f1(tag, p_t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.7891067538126362, 0.8836601307189542)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(exact_match), np.mean(exact_match_tag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9678974144774667, 0.9831785594259043)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(f1), np.mean(f1_tag)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
